The ranks will be a named numeric vector, with the
`entrezgene_id` as names and `stat` (Wald test statistic) as the
ranking metric.
#' Build a named vector of ranking statistics.
#'
#' @param res Result object that `as.data.frame()` can convert; its row names
#'   are used as `GeneID` and it must provide a `stat` column.
#' @param annot Annotation table with the columns `GeneID` and `entrezgene_id`.
#' @return The `stat` values, sorted in decreasing order and named by
#'   `entrezgene_id`. Genes without an `entrezgene_id` are left out.
getRanks <- function(res, annot) {
  # Annotation rows without an entrezgene_id cannot be named, so drop them.
  entrez_map <- filter(
    select(annot, GeneID, entrezgene_id),
    !is.na(entrezgene_id)
  )
  # Move the row names into a GeneID column so they can serve as merge key.
  res_tbl <- tibble::rownames_to_column(as.data.frame(res), "GeneID")
  ranked <- arrange(merge(res_tbl, entrez_map, by = "GeneID"), desc(stat))
  # deframe() turns the two columns into a vector (values) with names.
  tibble::deframe(select(ranked, entrezgene_id, stat))
}
# Ranked statistics for the gastroc and the soleus results; both use the same
# annotation table.
ranks.gastroc <- getRanks(res = res.gastroc, annot = annot)
ranks.soleus <- getRanks(res = res.soleus, annot = annot)
# TODO: why (again) is the soleus gene count seemingly 300 below gastroc gene count / ranks count
duplicate entrezgene ids (multiple entrez are mapped to the same gene name)
Thus, a quick look at whether any of these genes can simply be omitted, e.g. if the gene_type is “other” or “tRNA”.
# TODO: do not use annot, but the merge? (actually used genes!)
# first looking at ENSEMBL:
# rows of annot whose GeneID already occurred further up, counted per
# (GeneID, gene_biotype) combination
duplicate_ENSEMBL <- summarise(
  dplyr::group_by(annot[duplicated(annot$GeneID), ], GeneID, gene_biotype),
  n = n()
)
## `summarise()` has grouped output by 'GeneID'. You can override using the `.groups` argument.
# looking at gene_names(ext):
# same counting, but for rows whose external_gene_name already occurred
duplicate_geneNames <- summarise(
  dplyr::group_by(
    annot[duplicated(annot$external_gene_name), ],
    GeneID, gene_biotype
  ),
  n = n()
)
## `summarise()` has grouped output by 'GeneID'. You can override using the `.groups` argument.
# duplicates always share the same gene_biotype: no GeneID occurs in more
# than one (GeneID, gene_biotype) group
anyDuplicated(duplicate_ENSEMBL$GeneID) # 0
## [1] 0
anyDuplicated(duplicate_geneNames$GeneID) # 0
## [1] 0
# are all ENSEMBL duplicates also among the gene_name duplicates?
sprintf(
  "%d / %d ENSEMBL duplications are also included in the duplicated gene_names",
  sum(duplicate_ENSEMBL$GeneID %in% duplicate_geneNames$GeneID),
  nrow(duplicate_ENSEMBL)
)
## [1] "173 / 173 ENSEMBL duplications are also included in the duplicated gene_names"
# how many more duplicate groups the gene names have than the ENSEMBL ids
diff_ENS_gene <- nrow(duplicate_geneNames) - nrow(duplicate_ENSEMBL)
Since all ENSEMBL duplicates are also found among the gene_name duplicates, using the ENSEMBL ID as identifier for the ranks would reduce the number of duplicates by 107.
However, most of the duplicates are “protein coding” and will therefore not be omitted.
Pathways are provided by http://www.gsea-msigdb.org/gsea/msigdb/mouse/collections.jsp
For now, the canonical pathways (CP) are used. These gene sets represent biological processes. They are compiled from the following databases, which form a subset of CP:
| database | gene sets |
|---|---|
| BioCarta | 252 |
| Reactome | 1249 |
| WikiPathways | 186 |
# Run fgsea on the ranked statistics; minSize / maxSize restrict the size of
# the gene sets that are considered.
# NOTE(review): `ranks` is not assigned at top level in the visible code (only
# ranks.gastroc / ranks.soleus are) -- presumably this chunk is illustrative
# and the per-muscle results are computed elsewhere; confirm.
# NOTE(review): the object is called `CGP` although the text describes the
# canonical pathways (CP) collection -- confirm which collection it holds.
fgseaRes <- fgsea(pathways = CGP, stats = ranks, minSize = 15, maxSize = 200)
Ordering pathways by padj values and using ES to distinguish up- from downregulation:
#' Obtain the top pathways ordered by `padj`, using `ES` for up or down
#' regulation.
#'
#' Keeps the rows whose `ES` has the requested sign and whose `padj` is below
#' `pCutoff`, orders them by ascending `padj` and returns at most `n` of them.
#'
#' @param fgseaRes Table with at least the columns `ES` and `padj`.
#' @param up `TRUE` keeps rows with `ES > 0`, `FALSE` keeps rows with `ES < 0`.
#' @param pCutoff Exclusive upper bound for `padj`. The default reads
#'   `params$pCutoff`, so `params` must exist when the default is used.
#' @param n Maximum number of rows to return.
#' @return The filtered and ordered rows of `fgseaRes`.
get_top_pathways <- function(fgseaRes,
                             up = TRUE,
                             pCutoff = params$pCutoff,
                             n = 10) {
  # Plain if/else instead of ifelse(): ifelse() is meant for vectors of
  # values, not for choosing between two functions based on a scalar flag.
  has_direction <- if (up) `>` else `<`
  fgseaRes %>%
    filter(has_direction(ES, 0), padj < pCutoff) %>%
    arrange(padj) %>%
    slice_head(n = n)
}
plot for top up and down regulated pathways
#' Plots the top n enrichment plots for the given fgsea result.
#'
#' @param fgseaRes fgsea result table, passed on to `get_top_pathways()`.
#' @param pathways Named list of gene sets, indexed by pathway name.
#' @param ranks Ranked statistics handed to `plotEnrichment()`.
#' @param n Maximum number of pathways to plot.
#' @param up `TRUE` selects pathways with positive `ES`, `FALSE` those with
#'   negative `ES`.
#' @return The figure built by `arrange_plts()`, or `NULL` (invisibly, with a
#'   warning) when no pathway passes the cutoff.
plot_top_enrichment <- function(fgseaRes, pathways, ranks, n = 9, up = TRUE) {
  # extracting the top n pathways
  top.pathways <- get_top_pathways(
    fgseaRes,
    up = up,
    pCutoff = params$pCutoff,
    n = n
  )
  # Take the names as a plain vector: `x[i]` selects a row only for a
  # data.table, but a column for a data.frame / tibble.
  pathway.names <- top.pathways$pathway
  if (length(pathway.names) == 0) {
    # 1:nrow() would iterate over c(1, 0) here and fail on a missing pathway.
    warning("no pathway passes the padj cutoff; nothing to plot", call. = FALSE)
    return(invisible(NULL))
  }
  # TODO: how can I use facet_wrap for this?
  # TODO: adjust yaxis to the same scale, e.g.
  #       coord_cartesian(xlim = c(0, 17000), ylim = c(-0.8, 0.0))
  # TODO: keep axis.text.x only on the lower row
  # TODO: keep axis.text.y only on the right column
  plot.list <- lapply(pathway.names, function(pathway) {
    # axis titles are blanked per panel; arrange_plts() adds shared ones
    plotEnrichment(pathways[[pathway]], ranks) +
      theme(
        axis.title.x = element_blank(),
        axis.title.y = element_blank()
      )
  })
  arrange_plts(plot.list)
}
#' Helper function to arrange the enrichment plots into one labelled figure.
#'
#' @param plt.list List of ggplot objects.
#' @return The object returned by `ggpubr::annotate_figure()`: the plots laid
#'   out by `ggpubr::ggarrange()`, labelled "A", "B", ..., with the x and y
#'   labels of the first plot used as shared axis titles.
arrange_plts <- function(plt.list) {
  nplts <- length(plt.list)
  # `[[` extracts the plot itself; `[` would return a one-element list whose
  # `$labels` is NULL, which silently drops the shared axis titles.
  first.plt <- if (nplts > 0) plt.list[[1]] else NULL
  # NOTE(review): assumes the plots carry explicit x / y entries in `$labels`
  # -- confirm for the ggplot2 version in use.
  xlab <- first.plt$labels[["x"]]
  ylab <- first.plt$labels[["y"]]
  # TODO: set axis to the same scale, e.g. x = c(0, 17000), y = c(-0.8, 0.0)
  # TODO: remove all x-axis labels except lower row
  figure <- ggpubr::ggarrange(
    plotlist = plt.list,
    # seq_len() yields no label for an empty list, unlike 1:0
    labels = LETTERS[seq_len(nplts)]
  )
  ggpubr::annotate_figure(
    figure,
    left = ggpubr::text_grob(ylab, rot = 90),
    bottom = ggpubr::text_grob(xlab)
  )
}
# plot_labels <-
# data.frame("label" = LETTERS[1:10], "pathway" = top.pathways)
# knitr::kable(caption = "plot labels", plot_labels)
plot_top_enrichment(fgseaRes.gastroc, CGP, ranks.gastroc, up = TRUE)
# TODO: add plot labels to return argument of plot_top_enrichment (use list probably)
# Label table for the panels above. The letters are generated to match the
# number of pathways actually returned, so fewer than 9 hits can neither be
# recycled silently nor raise a length error in data.frame().
plot_labels <- get_top_pathways(fgseaRes.gastroc, up = TRUE, n = 9)$pathway
plot_labels <- data.frame(
  "label" = LETTERS[seq_along(plot_labels)],
  "pathway" = plot_labels
)
knitr::kable(caption = "plot labels", plot_labels)
| label | pathway |
|---|---|
| A | WP_TYROBP_CAUSAL_NETWORK_IN_MICROGLIA |
| B | WP_MICROGLIA_PATHOGEN_PHAGOCYTOSIS_PATHWAY |
| C | WP_APOPTOSIS |
| D | REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL |
| E | WP_FIBRIN_COMPLEMENT_RECEPTOR_3_SIGNALING_PATHWAY |
| F | WP_CHEMOKINE_SIGNALING_PATHWAY |
| G | BIOCARTA_TNFR2_PATHWAY |
| H | REACTOME_DAP12_INTERACTIONS |
| I | REACTOME_FCGAMMA_RECEPTOR_FCGR_DEPENDENT_PHAGOCYTOSIS |
plot_top_enrichment(fgseaRes.gastroc, CGP, ranks.gastroc, up = FALSE)
# Label table for the panels above. The letters are generated to match the
# number of pathways actually returned, so fewer than 9 hits can neither be
# recycled silently nor raise a length error in data.frame().
plot_labels <- get_top_pathways(fgseaRes.gastroc, up = FALSE, n = 9)$pathway
plot_labels <- data.frame(
  "label" = LETTERS[seq_along(plot_labels)],
  "pathway" = plot_labels
)
knitr::kable(caption = "plot labels", plot_labels)
| label | pathway |
|---|---|
| A | REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT |
| B | REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS |
| C | REACTOME_RESPIRATORY_ELECTRON_TRANSPORT |
| D | WP_ELECTRON_TRANSPORT_CHAIN |
| E | REACTOME_COMPLEX_I_BIOGENESIS |
| F | REACTOME_MITOCHONDRIAL_TRANSLATION |
| G | REACTOME_KEAP1_NFE2L2_PATHWAY |
| H | REACTOME_CELLULAR_RESPONSE_TO_HYPOXIA |
| I | REACTOME_CELLULAR_RESPONSE_TO_CHEMICAL_STRESS |
plot_top_enrichment(fgseaRes.soleus, CGP, ranks.soleus, up = TRUE)
# Label table for the panels above. The letters are generated to match the
# number of pathways actually returned, so fewer than 9 hits can neither be
# recycled silently nor raise a length error in data.frame().
plot_labels <- get_top_pathways(fgseaRes.soleus, up = TRUE, n = 9)$pathway
plot_labels <- data.frame(
  "label" = LETTERS[seq_along(plot_labels)],
  "pathway" = plot_labels
)
knitr::kable(caption = "plot labels", plot_labels)
| label | pathway |
|---|---|
| A | REACTOME_SRP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE |
| B | REACTOME_FORMATION_OF_A_POOL_OF_FREE_40S_SUBUNITS |
| C | REACTOME_NONSENSE_MEDIATED_DECAY_NMD_INDEPENDENT_OF_THE_EXON_JUNCTION_COMPLEX_EJC |
| D | WP_CYTOPLASMIC_RIBOSOMAL_PROTEINS |
| E | WP_TYROBP_CAUSAL_NETWORK_IN_MICROGLIA |
| F | REACTOME_EUKARYOTIC_TRANSLATION_INITIATION |
| G | REACTOME_NONSENSE_MEDIATED_DECAY_NMD |
| H | REACTOME_MAJOR_PATHWAY_OF_RRNA_PROCESSING_IN_THE_NUCLEOLUS_AND_CYTOSOL |
| I | REACTOME_PRC2_METHYLATES_HISTONES_AND_DNA |
plot_top_enrichment(fgseaRes.soleus, CGP, ranks.soleus, up = FALSE)
# Label table for the panels above. The letters are generated to match the
# number of pathways actually returned, so fewer than 9 hits can neither be
# recycled silently nor raise a length error in data.frame().
plot_labels <- get_top_pathways(fgseaRes.soleus, up = FALSE, n = 9)$pathway
plot_labels <- data.frame(
  "label" = LETTERS[seq_along(plot_labels)],
  "pathway" = plot_labels
)
knitr::kable(caption = "plot labels", plot_labels)
| label | pathway |
|---|---|
| A | REACTOME_KEAP1_NFE2L2_PATHWAY |
| B | REACTOME_CELLULAR_RESPONSE_TO_CHEMICAL_STRESS |
| C | REACTOME_GLI3_IS_PROCESSED_TO_GLI3R_BY_THE_PROTEASOME |
| D | REACTOME_UBIQUITIN_MEDIATED_DEGRADATION_OF_PHOSPHORYLATED_CDC25A |
| E | REACTOME_RUNX1_REGULATES_TRANSCRIPTION_OF_GENES_INVOLVED_IN_DIFFERENTIATION_OF_HSCS |
| F | REACTOME_CELLULAR_RESPONSE_TO_HYPOXIA |
| G | REACTOME_DEGRADATION_OF_DVL |
| H | REACTOME_ABC_FAMILY_PROTEINS_MEDIATED_TRANSPORT |
| I | REACTOME_ASYMMETRIC_LOCALIZATION_OF_PCP_PROTEINS |
top significant pathways:
# creating up and down regulated pathway vectors separately to maintain order
# (TRUE / FALSE spelled out: T and F are ordinary variables that can be
# reassigned)
topUp <- get_top_pathways(
  fgseaRes.gastroc,
  up = TRUE, pCutoff = params$pCutoff, n = 10
)
topDown <- get_top_pathways(
  fgseaRes.gastroc,
  up = FALSE, pCutoff = params$pCutoff, n = 10
)
# pathway names of both selections, from highest to lowest NES
topPathways <- bind_rows(topUp, topDown) %>%
  arrange(desc(NES)) %>%
  pull(pathway)
plotGseaTable(
  pathways = CGP[topPathways],
  stats = ranks.gastroc,
  fgseaRes = fgseaRes.gastroc,
  gseaParam = 0.5,
  render = TRUE
) %>%
  # needed since, for whatever reason only `NULL` gets returned if
  # `plotGseaTable` is rendered inline
  ggpubr::as_ggplot()
top significant pathways:
# creating up and down regulated pathway vectors separately to maintain order
# (TRUE / FALSE spelled out: T and F are ordinary variables that can be
# reassigned)
topUp <- get_top_pathways(
  fgseaRes.soleus,
  up = TRUE, pCutoff = params$pCutoff, n = 10
)
topDown <- get_top_pathways(
  fgseaRes.soleus,
  up = FALSE, pCutoff = params$pCutoff, n = 10
)
# pathway names of both selections, from highest to lowest NES
topPathways <- bind_rows(topUp, topDown) %>%
  arrange(desc(NES)) %>%
  pull(pathway)
plotGseaTable(
  pathways = CGP[topPathways],
  stats = ranks.soleus,
  fgseaRes = fgseaRes.soleus,
  gseaParam = 0.5,
  render = TRUE
) %>%
  # needed since, for whatever reason only `NULL` gets returned if
  # `plotGseaTable` is rendered inline
  ggpubr::as_ggplot()
Using the NES from the fgsea results and keeping only pathways that pass the
configured cutoff (pCutoff=0.01) in at least one of the two results yields the following plot:
pCutoff <- params$pCutoff
# Combine the gastroc and soleus fgsea results per pathway (only pathways that
# occur in both tables survive the merge), keep those significant in at least
# one of them, and classify each pathway by the sign of its NES and by which
# result is significant.
res.combined <- merge(
  data.frame(fgseaRes.gastroc[, c("pathway", "NES", "padj")]),
  data.frame(fgseaRes.soleus[, c("pathway", "NES", "padj")]),
  by = "pathway",
  suffixes = c(".ga", ".sol")
) %>%
  filter(padj.ga < pCutoff | padj.sol < pCutoff) %>%
  mutate(
    diff.exp = case_when(
      # significant in both results
      NES.ga < 0 & NES.sol < 0 & padj.ga < pCutoff & padj.sol < pCutoff ~ "both down",
      NES.ga > 0 & NES.sol > 0 & padj.ga < pCutoff & padj.sol < pCutoff ~ "both up",
      NES.ga < 0 & NES.sol > 0 & padj.ga < pCutoff & padj.sol < pCutoff ~ "ga down, sol up",
      NES.ga > 0 & NES.sol < 0 & padj.ga < pCutoff & padj.sol < pCutoff ~ "ga up, sol down",
      # significant in exactly one result; `>=` is the exact complement of the
      # `<` significance test, so padj == pCutoff counts as not significant
      # instead of dropping into the catch-all below
      NES.ga < 0 & padj.ga < pCutoff & padj.sol >= pCutoff ~ "ga down",
      NES.ga > 0 & padj.ga < pCutoff & padj.sol >= pCutoff ~ "ga up",
      NES.sol < 0 & padj.ga >= pCutoff & padj.sol < pCutoff ~ "sol down",
      NES.sol > 0 & padj.ga >= pCutoff & padj.sol < pCutoff ~ "sol up",
      # anything left over, e.g. missing values or an NES of exactly 0
      TRUE ~ "different"
    )
  )
# final plot
# Scatter plot with one point per pathway in res.combined: gastroc NES on the
# x axis, soleus NES on the y axis, points coloured by diff.exp. `pathway` is
# mapped to `text`, presumably so that it shows up in the plotly tooltip.
p <- ggplot(res.combined, aes(x = NES.ga, y = NES.sol, text=pathway)) +
# reference lines through NES = 0 on both axes
geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +
geom_point(aes(color = diff.exp)) +
# scale_color_manual(values = c("red", "chartreuse1", "bisque", "royalblue")) +
labs(x = "gastroc", y = "soleus") +
# ggrepel::geom_label_repel(max.overlaps = 20) +
ggtitle(label = "NES")
# convert to an interactive plotly widget showing all mapped aesthetics
plotly::ggplotly(p, tooltip = "all")
# bar chart counting the pathways in each diff.exp category
ggplot(res.combined, aes(x = diff.exp)) +
geom_bar(aes(fill = diff.exp))
[ ] looking at duplicate entrezgene_ids
[ ] finding optimal maxSize (one sided curve)
[ ] find out biological meaning of significant pathways